bitkeeper revision 1.911.1.3 (40ac8592YPN8CVevw_ez5NasKUdPag)
author iap10@labyrinth.cl.cam.ac.uk <iap10@labyrinth.cl.cam.ac.uk>
Thu, 20 May 2004 10:16:50 +0000 (10:16 +0000)
committer iap10@labyrinth.cl.cam.ac.uk <iap10@labyrinth.cl.cam.ac.uk>
Thu, 20 May 2004 10:16:50 +0000 (10:16 +0000)
live migration improvements

tools/examples/xc_dom_control.py
tools/xc/lib/xc_linux_restore.c
tools/xc/lib/xc_linux_save.c
tools/xc/py/Xc.c
xen/common/shadow.c
xen/include/hypervisor-ifs/dom0_ops.h

index 2da4ddaf7be64495edf3613766df6dcda5049fd7..9feab009e369d50cb786cefdde4f170bcdfa2f54 100755 (executable)
@@ -136,17 +136,7 @@ elif cmd == 'suspend':
         pid = int(fd.readline())
         os.kill(pid, signal.SIGTERM)
 
-    """
-    xc.domain_stop( dom=dom )
-    XXX
-    while not xc.domain_getinfo( first_dom=dom, max_doms=1 )[0]['stopped']:
-       print "Sleep..."
-       time.sleep(0.001);
-    """
-
     rc = xc.linux_save( dom=dom, state_file=file, progress=1)
-    if rc == 0 : xc.domain_destroy( dom=dom, force=1 )
-    else: xc.domain_start( dom=dom )  # sensible for production use
 
 elif cmd == 'cpu_bvtslice':
     if len(sys.argv) < 3:
index d66e22fd0a6c8e11c7cdf0d17451be532a3b90de..4e89b5715f8ef14041f2116db5a82c8ab2479fe6 100644 (file)
@@ -592,14 +592,11 @@ int xc_linux_restore(int xc_handle,
 
 
  out:
-    if ( rc != 0 )  // destroy is something went wrong
+    if ( rc != 0 )  // destroy if something went wrong
     {
         if ( dom != 0 )
         {
-            op.cmd = DOM0_DESTROYDOMAIN;
-            op.u.destroydomain.domain = (domid_t)dom;
-            op.u.destroydomain.force  = 1;
-            (void)do_dom0_op(xc_handle, &op);
+           xc_domain_destroy( xc_handle, dom, 1 );
         }
     }
 
index 64625c53f66f0d681e0510021dba4a0c341d7626..8bcd207d7f7f8bbee427fc41c00c33abc5477c27 100644 (file)
@@ -95,7 +95,7 @@ int xc_linux_save(int xc_handle,
     int verbose = flags & XCFLAGS_VERBOSE;
     int live = flags & XCFLAGS_LIVE;
     int debug = flags & XCFLAGS_DEBUG;
-    int sent_last_iter, sent_this_iter, max_iters;
+    int sent_last_iter, sent_this_iter, skip_this_iter, max_iters;
 
     /* Remember if we stopped the guest, so we can restart it on exit. */
     int we_stopped_it = 0;
@@ -137,8 +137,11 @@ int xc_linux_save(int xc_handle,
     /* number of pages we're dealing with */
     unsigned long nr_pfns;
 
-    /* bitmap of pages left to send */
-    unsigned long *to_send, *to_fix;
+    /* bitmap of pages:
+       - that should be sent this iteration (unless later marked as skip); 
+       - to skip this iteration because already dirty;
+       - to fixup by sending at the end if not already resent; */
+    unsigned long *to_send, *to_skip, *to_fix;
 
     int needed_to_fix = 0;
     int total_sent    = 0;
@@ -289,7 +292,7 @@ int xc_linux_save(int xc_handle,
 
        last_iter = 0;
        sent_last_iter = 1<<20; // 4GB's worth of pages
-       max_iters = 9; // limit us to 10 time round loop
+       max_iters = 19; // limit us to 20 times round loop
     }
     else
        last_iter = 1;
@@ -301,12 +304,14 @@ int xc_linux_save(int xc_handle,
        
        to_send = malloc( sz );
        to_fix  = calloc( 1, sz );
+       to_skip = malloc( sz );
 
-       if (!to_send || !to_fix)
+       if (!to_send || !to_fix || !to_skip)
        {
            ERROR("Couldn't allocate to_send array");
            goto out;
        }
+
        memset( to_send, 0xff, sz );
 
        if ( mlock( to_send, sz ) )
@@ -314,6 +319,15 @@ int xc_linux_save(int xc_handle,
            PERROR("Unable to mlock to_send");
            return 1;
        }
+
+       /* (to fix is local only) */
+
+       if ( mlock( to_skip, sz ) )
+       {
+           PERROR("Unable to mlock to_skip");
+           return 1;
+       }
+
     }
 
 
@@ -379,6 +393,7 @@ int xc_linux_save(int xc_handle,
        iter++;
 
        sent_this_iter = 0;
+       skip_this_iter = 0;
        prev_pc = 0;
        verbose_printf("Saving memory pages: iter %d   0%%", iter);
 
@@ -392,6 +407,18 @@ int xc_linux_save(int xc_handle,
                prev_pc = this_pc;
            }
 
+           /* slightly wasteful to peek the whole array every time,
+              but this is fast enough for the moment. */
+
+           if ( !last_iter && 
+                xc_shadow_control( xc_handle, domid, 
+                                   DOM0_SHADOW_CONTROL_OP_PEEK,
+                                   to_skip, nr_pfns ) != nr_pfns ) 
+           {
+               ERROR("Error peeking shadow bitmap");
+               goto out;
+           }
+           
 
            /* load pfn_type[] with the mfn of all the pages we're doing in
               this batch. */
@@ -405,15 +432,29 @@ int xc_linux_save(int xc_handle,
                            test_bit(n,to_send),
                            live_mfn_to_pfn_table[live_pfn_to_mfn_table[n]&0xFFFFF]);
 
+               if (!last_iter && test_bit(n, to_send) && test_bit(n, to_skip))
+                   skip_this_iter++; // stats keeping
+
+               if (! ( (test_bit(n, to_send) && !test_bit(n, to_skip)) ||
+                       (test_bit(n, to_send) && last_iter) ||
+                       (test_bit(n, to_fix)  && last_iter) )   )
+                   continue;
 
-               if ( !test_bit(n, to_send ) &&
-                   !( last_iter && test_bit(n, to_fix ) ) ) continue;
+               /* we get here if:
+                  1. page is marked to_send & hasn't already been re-dirtied
+                  2. (ignore to_skip in last iteration)
+                  3. add in pages that still need fixup (net bufs)
+                */
                
                pfn_batch[batch] = n;
                pfn_type[batch] = live_pfn_to_mfn_table[n];
 
                if( pfn_type[batch] == 0x80000004 )
                {
+                   /* not currently in pseudo-physical map -- set bit
+                      in to_fix so that we send this page in last_iter
+                      unless it's sent sooner anyhow */
+
                    set_bit( n, to_fix );
                    if( iter>1 )
                        DDPRINTF("Urk! netbuf race: iter %d, pfn %lx. mfn %lx\n",
@@ -572,7 +613,8 @@ int xc_linux_save(int xc_handle,
 
        total_sent += sent_this_iter;
 
-       verbose_printf("\b\b\b\b100%% (%d pages)\n", sent_this_iter );
+       verbose_printf("\b\b\b\b100%% (pages sent= %d, skipped= %d )\n", 
+                      sent_this_iter, skip_this_iter );
        
        if ( last_iter )
        {
@@ -604,7 +646,8 @@ int xc_linux_save(int xc_handle,
        if ( live )
        {
            if ( ( sent_this_iter > (sent_last_iter * 0.95) ) ||
-                (iter >= max_iters) || (sent_this_iter < 10) )
+                (iter >= max_iters) || (sent_this_iter < 10) || 
+                (total_sent > nr_pfns*2) )
            {
                DPRINTF("Start last iteration\n");
                last_iter = 1;
@@ -685,14 +728,6 @@ int xc_linux_save(int xc_handle,
     munmap(live_shinfo, PAGE_SIZE);
 
 out:
-    /* Restart the domain if we had to stop it to save its state. */
-    if ( we_stopped_it )
-    {
-       printf("Restart domain\n");
-        op.cmd = DOM0_STARTDOMAIN;
-        op.u.startdomain.domain = (domid_t)domid;
-        (void)do_dom0_op(xc_handle, &op);
-    }
 
     if ( pfn_type != NULL )
         free(pfn_type);
index 7bb1d877bdda8a4447b69ad97896061a6e982486..8a6e3b22efa92a9878fd8baa3a84afe83066d65f 100644 (file)
@@ -214,6 +214,7 @@ static PyObject *pyxc_linux_save(PyObject *self,
        struct hostent *h;
        struct sockaddr_in s;
        int sockbufsize;
+       int rc = -1;
 
        int writerfn(void *fd, const void *buf, size_t count)
        {
@@ -257,12 +258,24 @@ static PyObject *pyxc_linux_save(PyObject *self,
        if ( xc_linux_save(xc->xc_handle, dom, flags, 
                            writerfn, (void*)sd) == 0 )
        {
-           close(sd);
-           Py_INCREF(zero);
-           return zero;
+           if ( read( sd, &rc, sizeof(int) ) != sizeof(int) )
+               goto serr;
+               
+           if ( rc == 0 )
+           {
+               printf("Migration succesful -- destroy local copy\n");
+               xc_domain_destroy( xc->xc_handle, dom, 1 );
+               close(sd);
+               Py_INCREF(zero);
+               return zero;
+           }
+           else
+               errno = rc;
        }
 
     serr:
+       printf("Migration failed -- restart local copy\n");
+       xc_domain_start( xc->xc_handle, dom );
        PyErr_SetFromErrno(xc_error);
        if ( sd >= 0 ) close(sd);
        return NULL;
@@ -355,7 +368,7 @@ static PyObject *pyxc_linux_restore(PyObject *self,
        struct sockaddr_in s, d, p;
        socklen_t dlen, plen;
        int sockbufsize;
-       int on = 1;
+       int on = 1, rc = -1;
 
        int readerfn(void *fd, void *buf, size_t count)
        {
@@ -413,13 +426,18 @@ static PyObject *pyxc_linux_restore(PyObject *self,
                         sizeof sockbufsize) < 0 ) 
            goto serr;
 
-       if ( xc_linux_restore(xc->xc_handle, dom, flags, 
-                              readerfn, (void*)sd, &dom) == 0 )
+       rc = xc_linux_restore(xc->xc_handle, dom, flags, 
+                              readerfn, (void*)sd, &dom);
+
+       write( sd, &rc, sizeof(int) ); 
+
+       if (rc == 0)
        {
            close(sd);
            Py_INCREF(zero);
            return zero;
        }
+       errno = rc;
 
     serr:
        PyErr_SetFromErrno(xc_error);
index f82502d2b4121e3157d42e6324594c3bd3833589..62081df926682e46361704dc40082299d13bc224 100644 (file)
@@ -110,10 +110,10 @@ static void __free_shadow_table( struct mm_struct *m )
 }
 
 static inline int shadow_page_op( struct mm_struct *m, unsigned int op,
-                                  struct pfn_info *spfn_info )
+                                  struct pfn_info *spfn_info, int *work )
 {
-    int work = 0;
     unsigned int spfn = spfn_info-frame_table;
+       int restart = 0;
 
     switch( op )
     {
@@ -129,7 +129,7 @@ static inline int shadow_page_op( struct mm_struct *m, unsigned int op,
             {                    
                 if ( (spl1e[i] & _PAGE_PRESENT ) && (spl1e[i] & _PAGE_RW) )
                 {
-                    work++;
+                    *work++;
                     spl1e[i] &= ~_PAGE_RW;
                 }
             }
@@ -138,14 +138,36 @@ static inline int shadow_page_op( struct mm_struct *m, unsigned int op,
     }
        break;
 
+    case DOM0_SHADOW_CONTROL_OP_CLEAN2:
+    {
+        if ( (spfn_info->type_and_flags & PGT_type_mask) == 
+             PGT_l1_page_table )
+        {
+                       delete_shadow_status( m, frame_table-spfn_info );
+                       restart = 1; // we need to go to start of list again
+               }
+               else if ( (spfn_info->type_and_flags & PGT_type_mask) == 
+             PGT_l2_page_table )
+               {
+                       unsigned long * spl1e = map_domain_mem( spfn<<PAGE_SHIFT );
+                       memset( spl1e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*spl1e) );
+                       unmap_domain_mem( spl1e );
+               }
+               else
+                       BUG();
+    }
+       break;
+
+
+
     }
-    return work;
+    return restart;
 }
 
 static void __scan_shadow_table( struct mm_struct *m, unsigned int op )
 {
     int j, work=0;
-    struct shadow_status *a;
+    struct shadow_status *a, *next;
  
     // the code assumes you're not using the page tables i.e.
     // the domain is stopped and cr3 is something else!!
@@ -156,16 +178,25 @@ static void __scan_shadow_table( struct mm_struct *m, unsigned int op )
 
     for(j=0;j<shadow_ht_buckets;j++)
     {
-        a = &m->shadow_ht[j];        
+       retry:
+        a = &m->shadow_ht[j];     
+               next = a->next;
         if (a->pfn)
         {
-            work += shadow_page_op( m, op, &frame_table[a->spfn_and_flags & PSH_pfn_mask] );
+            if ( shadow_page_op( m, op, 
+                                                       &frame_table[a->spfn_and_flags & PSH_pfn_mask], 
+                                                       &work ) )
+                               goto retry;
         }
-        a=a->next;
+        a=next;
         while(a)
         { 
-            work += shadow_page_op( m, op, &frame_table[a->spfn_and_flags & PSH_pfn_mask] );
-            a=a->next;
+                       next = a->next;
+            if ( shadow_page_op( m, op, 
+                                                       &frame_table[a->spfn_and_flags & PSH_pfn_mask],
+                                                       &work ) )
+                               goto retry;
+            a=next;
         }
         shadow_audit(m,0);
     }
@@ -304,7 +335,8 @@ static int shadow_mode_table_op( struct task_struct *p,
         __free_shadow_table( m );
         break;
    
-    case DOM0_SHADOW_CONTROL_OP_CLEAN:
+    case DOM0_SHADOW_CONTROL_OP_CLEAN:   // zero all-non hypervisor
+    case DOM0_SHADOW_CONTROL_OP_CLEAN2:  // zero all L2, free L1s
     {
                int i,j,zero=1;
                
@@ -418,7 +450,7 @@ int shadow_mode_control( struct task_struct *p, dom0_shadow_control_t *sc )
         if(p->mm.shadow_mode) shadow_mode_disable(p);
         shadow_mode_enable(p, SHM_logdirty);
     } 
-    else if ( p->mm.shadow_mode && cmd >= DOM0_SHADOW_CONTROL_OP_FLUSH && cmd<=DOM0_SHADOW_CONTROL_OP_PEEK )
+    else if ( p->mm.shadow_mode && cmd >= DOM0_SHADOW_CONTROL_OP_FLUSH && cmd<=DOM0_SHADOW_CONTROL_OP_CLEAN2 )
     {
         rc = shadow_mode_table_op(p, sc);
     }
index 113f6bb02b59d956f46f09e53969a580898ec753..58b1480525dae7167c9395e3c29fe9df9e4e2ce9 100644 (file)
@@ -236,9 +236,10 @@ typedef struct dom0_sched_id_st
 #define DOM0_SHADOW_CONTROL_OP_OFF         0
 #define DOM0_SHADOW_CONTROL_OP_ENABLE_TEST 1
 #define DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY 2
-#define DOM0_SHADOW_CONTROL_OP_FLUSH       10
+#define DOM0_SHADOW_CONTROL_OP_FLUSH       10     /* table ops */
 #define DOM0_SHADOW_CONTROL_OP_CLEAN       11
 #define DOM0_SHADOW_CONTROL_OP_PEEK        12
+#define DOM0_SHADOW_CONTROL_OP_CLEAN2      13
 typedef struct dom0_shadow_control_st
 {
     /* IN variables. */